Loading Libraries

library(ggplot2)
library(maps)
library(ggrepel)
library(tidyverse)
library(dplyr)
library(tidyr)
library(corrplot)
library(gridExtra)
library(plotly)
library(factoextra)
library(psych)
library(GGally)

Loading data

air=read.csv('indian_weather_data.csv')
head(air)
# making numerical df
num_df=air[c("lat","lon","temperature","weather_code","co","no2","o3","so2","pm2_5","pm10","wind_speed","wind_degree","pressure",    "precip","humidity","cloudcover","feelslike","visibility" )]
num_df=data.frame(num_df)

# making categorical df
cat_df=air[c("city","sunrise","sunset","moonrise","moonset","wind_dir")]

Data Handling

colSums(is.na(air))
##         city          lat          lon  temperature weather_code      sunrise 
##            0            0            0            0            0            0 
##       sunset     moonrise      moonset           co          no2           o3 
##            0            0            0            0            0            0 
##          so2        pm2_5         pm10   wind_speed  wind_degree     wind_dir 
##            0            0            0            0            0            0 
##     pressure       precip     humidity   cloudcover    feelslike     uv_index 
##            0            0            0            0            0            0 
##   visibility 
##            0

PCA: Principal Component Analysis

pca_fit <- prcomp(num_df, scale=TRUE)
pca_summary<-summary(pca_fit)

importance_matrix<-pca_summary$importance

# Convert to data frame
pca_df <- as.data.frame(importance_matrix)
pca_df
# Rotating the factors
rotation_df<-as.data.frame(pca_fit$rotation[,0:5])
rotation_df
fviz_pca_biplot(pca_fit, 
                repel = TRUE,  # Prevents text overlapping
                col.var = "blue",    # Variables color
                col.ind = "gray",    # Individuals color
                title = "PCA Biplot")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the ggpubr package.
##   Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Factor Analysis

KMO and Bartlett’s Test of Sphericity

# making a correlation matrix
correlation_matrix<-cor(num_df)

#KMO Test
kmo_result <- KMO(correlation_matrix)
print(kmo_result)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = correlation_matrix)
## Overall MSA =  0.69
## MSA for each item = 
##          lat          lon  temperature weather_code           co          no2 
##         0.85         0.54         0.72         0.65         0.83         0.35 
##           o3          so2        pm2_5         pm10   wind_speed  wind_degree 
##         0.82         0.78         0.75         0.74         0.56         0.34 
##     pressure       precip     humidity   cloudcover    feelslike   visibility 
##         0.79         0.35         0.56         0.81         0.69         0.49
# Bartlett's Test of Sphericity 
cortest.bartlett(correlation_matrix, n = nrow(num_df))
## $chisq
## [1] 1873.867
## 
## $p.value
## [1] 1.449223e-293
## 
## $df
## [1] 153
  1. KMO Test : we can observe the overall MSA value is greater than 0.69 indicating that correlation matrix is not identity matrix.
  2. Bartlett’s Test of Sphericity : We can Observe that the p-value is 1.449223e-293 <0.05 indicating that the data is suitable for factor analysis

Deciding number of factors

scree(num_df, factors = FALSE, pc = TRUE, 
      main = "Scree Plot for Factor Analysis")

fa<- fa(num_df, nfactors=5,rotate="varimax",scores="regression")
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully
fa
## Factor Analysis using method =  minres
## Call: fa(r = num_df, nfactors = 5, rotate = "varimax", scores = "regression")
## Standardized loadings (pattern matrix) based upon correlation matrix
##                MR1   MR2   MR3   MR5   MR4    h2      u2 com
## lat           0.48  0.70 -0.21 -0.23 -0.12 0.840  0.1599 2.3
## lon          -0.10  0.61  0.32  0.46  0.16 0.720  0.2796 2.7
## temperature  -0.20 -0.97 -0.06 -0.08  0.03 0.985  0.0150 1.1
## weather_code  0.24  0.08  0.63  0.27 -0.10 0.543  0.4571 1.8
## co            0.93  0.21  0.15  0.02 -0.03 0.927  0.0731 1.2
## no2           0.45 -0.02 -0.03  0.00  0.66 0.632  0.3683 1.8
## o3            0.73 -0.10  0.23 -0.29 -0.13 0.693  0.3071 1.7
## so2           0.86  0.14  0.10 -0.09  0.16 0.810  0.1903 1.2
## pm2_5         0.94  0.18  0.18 -0.01 -0.09 0.950  0.0502 1.2
## pm10          0.93  0.18  0.19 -0.01 -0.10 0.936  0.0644 1.2
## wind_speed   -0.39 -0.27  0.00  0.09  0.78 0.833  0.1668 1.8
## wind_degree  -0.08  0.14  0.07 -0.08  0.20 0.074  0.9257 2.9
## pressure      0.19  0.87  0.10 -0.13 -0.01 0.812  0.1880 1.2
## precip       -0.12 -0.14  0.14  0.96 -0.07 0.986  0.0140 1.1
## humidity      0.05  0.30  0.76  0.30  0.17 0.785  0.2154 1.8
## cloudcover    0.56  0.28 -0.35  0.16 -0.12 0.561  0.4392 2.5
## feelslike    -0.14 -0.99 -0.01  0.01  0.04 1.005 -0.0049 1.0
## visibility   -0.24  0.10 -0.79  0.19 -0.04 0.739  0.2611 1.4
## 
##                        MR1  MR2  MR3  MR5  MR4
## SS loadings           5.01 3.97 2.07 1.55 1.23
## Proportion Var        0.28 0.22 0.11 0.09 0.07
## Cumulative Var        0.28 0.50 0.61 0.70 0.77
## Proportion Explained  0.36 0.29 0.15 0.11 0.09
## Cumulative Proportion 0.36 0.65 0.80 0.91 1.00
## 
## Mean item complexity =  1.6
## Test of the hypothesis that 5 factors are sufficient.
## 
## df null model =  153  with the objective function =  28.32 with Chi Square =  1873.87
## df of  the model are 73  and the objective function was  10.23 
## 
## The root mean square of the residuals (RMSR) is  0.04 
## The df corrected root mean square of the residuals is  0.06 
## 
## The harmonic n.obs is  74 with the empirical chi square  38.28  with prob <  1 
## The total n.obs was  74  with Likelihood Chi Square =  642.77  with prob <  4.8e-92 
## 
## Tucker Lewis Index of factoring reliability =  0.266
## RMSEA index =  0.324  and the 90 % confidence intervals are  0.304 0.35
## BIC =  328.58
## Fit based upon off diagonal values = 0.99

Factors are:

  1. Factor 1: positive effect: co, o3, so2, pm2_5,pm10 negative effect: lon, temprature

  2. Factor 2: positive effect: lat,lon,pressure negative effect: temperature,feelslike

  3. Factor 3: positive effect: wether_code,humidity negative effect: visibility

  4. Factor 4: positive effect: no2,wind_speed

  5. Factor 5: positive effect: precip

  • Factor Analysis is used for identifying the underlying structure of the data. It helps in reducing variable and these obtained factors can be effectively used for EDA.

  • The First Factor can be named as pollutants, Second Factor as geographic_cond, Third Factor is weather_cond , Fourth can be named as ozone since higher wind speed decreases no2 concentration and fifth can be named precipitation

Data Engineering

Adding FA Score / Factors for EDA

# Storing FA Scores as df 
fa_scores<-as.data.frame(fa$scores)

# renaming column names
colnames(fa_scores)=c("pollutants","geographic_cond","weather_cond","ozone","precipitation")

# concatenating 2 dfs air and fa_scores
df<-cbind(air,fa_scores)
head(df)

Converting Weather code and visibility to categorical variables

df<-df %>% 
  mutate(weather_cat=case_when(
    weather_code==113~"sunny",
    weather_code==122~"partly cloudly",
    weather_code==143~"mist",
    weather_code==116~"moderate rain",
    weather_code==119~"showers",
    weather_code==248~"fog",
    weather_code==176~"moderate rain",
  )) %>% 
  mutate(visibility_cat=case_when(
    visibility<1~"very poor",
    between(visibility,1,3) ~"poor",
    between(visibility,3,5) ~"moderate",
    visibility>5 ~"good"
  )) %>% 
  mutate(parts_of_India = case_when(
    between(lat, 28, 37.6) & between(lon, 68.7, 97.25) ~ "North",
    between(lat, 15, 28) & between(lon, 68, 78) ~ "West",
    between(lat, 20, 28) & between(lon, 83, 97.25) ~ "East",
    lat < 20 & between(lon, 74, 84) ~ "South",
    between(lat, 18, 26) & between(lon, 74, 85) ~ "Central",
    between(lat, 22, 28) & between(lon, 89, 97.25) ~ "Northeast",
    TRUE ~ "Other Region"
  ))

Exploratory Data Analysis

Correlation among variables

# Calculate correlation with pairwise complete observations
weather_cor <- cor(num_df,
                   use = "pairwise.complete.obs")

c_color<- colorRampPalette(c("hotpink", "lightgreen","darkblue"))  

corrplot(weather_cor, 
         method = "color",
         title = "Weather Variables Correlation",
         order="hclust",
         col=c_color(10),
         tl.col="black"
         )

* Positive Correlation: pollutants are highly correlated such as pm2_5, pm_10, co, so2 , no2 is moderately correlated, feelslike and temperature

  • Moderate Correlation (positive and negative): We can see moderate correlation between co2 and weather code,longitude with temperature and feelslike,latitude with temprature and feelslike

  • Negative Correlation:pressure with feelslike and temprature

Temprature according to latitude and longitude values

world<-map_data("world")

#getting the map for india 
india<-subset(world,region=="India")


ggplot()+
   geom_polygon(data=india,
                aes(x=long,y=lat,group=group))+
    geom_point(data=df,
             aes(x=lon,y=lat,color=pm2_5))+
  scale_color_continuous(
    type = "viridis",  # Or "gradient"
    name = "pm 2.5"
  )+
  scale_x_log10()

# setting theme for all plots
set_theme(theme_minimal()+
            theme( 
                plot.title=
                  element_text(
                    size=rel(2)),
                
                panel.background = 
                  element_rect(color="black"),
                
              ))

Top 20 cities with worst air quality pm10 and pm2

# pm10
df %>% 
  arrange(desc(pm10)) %>% 
  select(city,pm10,wind_dir) %>% 
  slice_head(n=20) %>% 
  ggplot(
    aes(x=reorder(city,-pm10),y=pm10,fill=wind_dir)
  )+
  labs(
    title="Highest pm10 Vs Cities and their wind direction",
    x="Cities",
    y="PM 10"
  ) +
  geom_bar(stat="identity")+
  theme(
      axis.text.x = 
                  element_text(angle=45,
                                hjust=1,
                               face="bold")
  )

df %>% 
  arrange(desc(pm2_5)) %>% 
  select(city,pm2_5,wind_dir) %>% 
  slice_head(n=20) %>% 
  ggplot(
    aes(x=reorder(city,-pm2_5),y=pm2_5,fill=wind_dir)
  )+
  geom_bar(stat="identity")+
  labs(
    title="Highest pm 2.5 Vs Cities and their wind direction",
    x="Cities",
    y="PM 2.5"
  )+
  theme(
      axis.text.x = 
                  element_text(angle=45,
                                hjust=1,
                               face="bold")
  )

#One-on-one relationships between two continuous variables ## Temperature vs PM2.5 levels

# temperature Vs pm2.5 levels
plot1<-df %>% 
  ggplot(aes(temperature,pm2_5,size=visibility))+
  geom_point(color="orange")+
  labs(
    title="Temperature Vs pm 2.5",
    subtitle="There influence on Visibility",
    x="Temperature",
    y="pm 2.5"
  )+
  theme(
    aspect.ratio = 1
  )
plot1

Temptrature Vs Pressure

# temperature Vs pm2.5 levels
plot2<-df %>% 
  ggplot(aes(temperature,pressure,size=visibility))+
  geom_point(color="pink")+
  labs(
    title="Temperature Vs Pressure",
    subtitle="There influence on Visibility",
    x="Temperature",
    y="Pressure"
  )+
  theme(
    aspect.ratio = 1
  )

plot2

# Observing 3 variables Wind speed , Temprature and pm 2.5 concentration

fig <- plot_ly(df, x = ~wind_speed, y = ~temperature, z = ~pm2_5, color = ~city)
fig
## No trace type specified:
##   Based on info supplied, a 'scatter3d' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## Warning in RColorBrewer::brewer.pal(max(N, 3L), "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(max(N, 3L), "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

Clustering based on pollutants

Making dataset for clustering

# Data for clustering
k_data<-df %>% 
  select("city","pollutants","weather_cond","geographic_cond")

Selecting number of clusters

fviz_nbclust(k_data[,c("pollutants","weather_cond","geographic_cond")], kmeans, method = "wss") + 
  ggtitle("Elbow Method")

fviz_nbclust(k_data[,c("pollutants","weather_cond","geographic_cond")], kmeans, method = "silhouette") + 
  ggtitle("Silhouette Method")

## Performing K-means clustering

# Perform k-means clustering (e.g., 4 clusters)
set.seed(123)
kmeans_result <- kmeans(k_data[,c("pollutants","weather_cond","geographic_cond")], centers = 4, nstart = 25)
print(kmeans_result)
## K-means clustering with 4 clusters of sizes 3, 13, 46, 12
## 
## Cluster means:
##   pollutants weather_cond geographic_cond
## 1 -1.2598530   -0.1369784       3.3428137
## 2  1.3139388   -0.9437692       0.3596657
## 3 -0.5321342   -0.1300377      -0.4335428
## 4  0.9313775    1.5551392       0.4365730
## 
## Clustering vector:
##  [1] 4 3 4 3 3 3 3 3 3 2 4 4 3 3 3 4 4 4 3 2 2 4 4 4 4 2 2 2 4 2 2 2 1 1 3 3 3 3
## [39] 3 3 2 3 3 3 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 1 3 3
## 
## Within cluster sum of squares by cluster:
## [1]  9.656581  6.247595 44.417537 16.614590
##  (between_SS / total_SS =  64.2 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Storing Clusters as factor

k_data$cluster <- as.factor(kmeans_result$cluster)

k_data %>%
  mutate(cluster_name = case_when(
    cluster == 1 ~ "Extremely Clean, Very Warm",
    cluster== 2 ~ "Outlier City",
    cluster== 3 ~ "Clean, Bad Weather",
    cluster== 4 ~ "Very Polluted, Bad Weather",
    cluster== 5 ~ "Polluted, Good Weather",
    TRUE ~ "Unknown"
  ))

Visualizing clusters

city_names<-k_data$city
fviz_cluster(kmeans_result, data = k_data[,c("pollutants","weather_cond","geographic_cond")],
             palette = "Set2", ggtheme = theme_minimal(),
             geom = "point") +
  geom_text(aes(label = city_names), 
            check_overlap = TRUE, 
            size = 3, 
            vjust = -0.5)+
 labs(
   title="Clusters Visualization"
 )+
  theme(
    plot.title = 
      element_text(face = "bold",
                   size=rel(2)),
    panel.background = 
                  element_rect(color="black")
  )+
  scale_fill_discrete(labels = c("A", "B", "C","D","E"))
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.